{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import csv\n",
    " \n",
    "def loadCsv(filename):\n",
    "    lines = csv.reader(open(filename, \"r\"))\n",
    "    dataset = list(lines)\n",
    "    for i in range(len(dataset)):\n",
    "        dataset[i] = [x for x in dataset[i]]\n",
    "    return dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def convertCatToNum(dataset):\n",
    "    for row in dataset:\n",
    "        if (row[0] == 'vhigh'): row[0] = 1;\n",
    "        elif (row[0] == 'high'): row[0] = 2;\n",
    "        elif (row[0] == 'med'): row[0] = 3;\n",
    "        elif (row[0] == 'low'): row[0] = 4;\n",
    "        \n",
    "        if (row[1] == 'vhigh'): row[1] = 1;\n",
    "        elif (row[1] == 'high'): row[1] = 2;\n",
    "        elif (row[1] == 'med'): row[1] = 3;\n",
    "        elif (row[1] == 'low'): row[1] = 4;\n",
    "        \n",
    "        if (row[2] == '2'): row[2] = 2;\n",
    "        elif (row[2] == '3'): row[2] = 3;\n",
    "        elif (row[2] == '4'): row[2] = 4;\n",
    "        elif (row[2] == '5'): row[2] = 5;\n",
    "        elif (row[2] == 'more' or row[2] == '5more'): row[2] = 6;\n",
    "        \n",
    "        if (row[3] == '2'): row[3] = 2;\n",
    "        elif (row[3] == '4'): row[3] = 4;\n",
    "        elif (row[3] == 'more'): row[3] = 5;\n",
    "        \n",
    "        if (row[4] == 'small'): row[4] = 1;\n",
    "        elif (row[4] == 'med'): row[4] = 2;\n",
    "        elif (row[4] == 'big'): row[4] = 3;\n",
    "        \n",
    "        if (row[5] == 'low'): row[5] = 1;\n",
    "        elif (row[5] == 'med'): row[5] = 2;\n",
    "        elif (row[5] == 'high'): row[5] = 3;\n",
    "        \n",
    "        if (row[6] == 'unacc'): row[6] = 1;\n",
    "        elif (row[6] == 'acc'): row[6] = 2;\n",
    "        elif (row[6] == 'good'): row[6] = 3;\n",
    "        elif (row[6] == 'vgood'): row[6] = 4;"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import random\n",
    "\n",
    "def splitDataset(dataset, splitRatio):\n",
    "    trainSize = int(len(dataset) * splitRatio)\n",
    "    trainSet = []\n",
    "    copy = list(dataset)\n",
    "    while len(trainSet) < trainSize:\n",
    "        index = random.randrange(len(copy))\n",
    "        trainSet.append(copy.pop(index))\n",
    "    return [trainSet, copy]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def separateByClass(dataset):\n",
    "    separated = {}\n",
    "    for i in range(len(dataset)):\n",
    "        vector = dataset[i]\n",
    "        if (vector[-1] not in separated):\n",
    "            separated[vector[-1]] = []\n",
    "        separated[vector[-1]].append(vector)\n",
    "    return separated"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import math\n",
    "\n",
    "def mean(numbers):\n",
    "    return sum(numbers)/float(len(numbers))\n",
    " \n",
    "def stdev(numbers):\n",
    "    avg = mean(numbers)\n",
    "    variance = sum([pow(x-avg,2) for x in numbers])/float(len(numbers)-1)\n",
    "    return math.sqrt(variance)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def summarize(dataset):\n",
    "    summaries = [(mean(attribute), stdev(attribute)) for attribute in zip(*dataset)]\n",
    "    del summaries[-1]\n",
    "    return summaries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def summarizeByClass(dataset):\n",
    "    separated = separateByClass(dataset)\n",
    "    summaries = {}\n",
    "    for classValue, instances in separated.items():\n",
    "        summaries[classValue] = summarize(instances)\n",
    "    return summaries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def calculateProbability(x, mean, stdev):\n",
    "    \n",
    "    if stdev == 0:\n",
    "        return 1. if x == mean else .1\n",
    "    \n",
    "    exponent = math.exp(-(math.pow(x-mean,2)/(2*math.pow(stdev,2))))\n",
    "    return (1 / (math.sqrt(2*math.pi) * math.pow(stdev, 2))) * exponent"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def calculateClassProbabilities(summaries, inputVector):\n",
    "    probabilities = {}\n",
    "    for classValue, classSummaries in summaries.items():\n",
    "        probabilities[classValue] = 1\n",
    "        for i in range(len(classSummaries)):\n",
    "            mean, stdev = classSummaries[i]\n",
    "            x = inputVector[i]\n",
    "            probabilities[classValue] *= calculateProbability(x, mean, stdev)\n",
    "    return probabilities"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def predict(summaries, inputVector):\n",
    "    probabilities = calculateClassProbabilities(summaries, inputVector)\n",
    "    bestLabel, bestProb = None, -1\n",
    "    for classValue, probability in probabilities.items():\n",
    "        if bestLabel is None or probability > bestProb:\n",
    "            bestProb = probability\n",
    "            bestLabel = classValue\n",
    "    return bestLabel"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "carDS = loadCsv(\"carData.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "convertCatToNum(carDS)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "splittedCarDS = splitDataset(carDS, 0.70)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "trainSet = splittedCarDS[0]\n",
    "testSet = splittedCarDS[1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "summary = summarizeByClass(trainSet)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def getPredictions(summaries, testSet):\n",
    "    predictions = []\n",
    "    for i in range(len(testSet)):\n",
    "        result = predict(summaries, testSet[i])\n",
    "        predictions.append(result)\n",
    "    return predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def getAccuracy(testSet, predictions):\n",
    "    correct = 0\n",
    "    for i in range(len(testSet)):\n",
    "        if testSet[i][-1] == predictions[i]:\n",
    "            correct += 1\n",
    "    return (correct/float(len(testSet))) * 100.0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "accs = []\n",
    "\n",
    "for i in range(100):\n",
    "    predictions = getPredictions(summary, testSet)\n",
    "    accuracy = getAccuracy(testSet, predictions)\n",
    "    accs.append(accuracy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Em 100 iterações, a melhor acurácia do NaiveBayes implementado foi: 70.32755298651252\n"
     ]
    }
   ],
   "source": [
    "maxAccuracy = max(accs)\n",
    "\n",
    "print('Em 100 iterações, a melhor acurácia do NaiveBayes implementado foi: {0}'.\n",
    "      format(maxAccuracy))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.naive_bayes import GaussianNB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "nb = GaussianNB()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train = []\n",
    "X_test = []\n",
    "y_train = []\n",
    "y_test = []\n",
    "\n",
    "for i in range(len(trainSet)):\n",
    "    newList = list(trainSet[i])\n",
    "    del newList[-1]\n",
    "    X_train.append(newList)\n",
    "\n",
    "for i in range(len(trainSet)):\n",
    "    newList = list(trainSet[i])\n",
    "    y_train.append(newList[-1])\n",
    "    \n",
    "for i in range(len(testSet)):\n",
    "    newList = list(testSet[i])\n",
    "    del newList[-1]\n",
    "    X_test.append(newList)\n",
    "    \n",
    "for i in range(len(testSet)):\n",
    "    newList = list(testSet[i])\n",
    "    y_test.append(newList[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GaussianNB(priors=None)"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nb.fit(X_train, y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "accs = []\n",
    "\n",
    "for i in range(100):\n",
    "    predictions = nb.predict(X_test)\n",
    "    accuracy = getAccuracy(testSet, predictions)\n",
    "    accs.append(accuracy)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Em 100 iterações, a melhor acurácia do NaiveBayes do scikit-learn foi: 76.878612716763\n"
     ]
    }
   ],
   "source": [
    "maxAccuracy = max(accs)\n",
    "\n",
    "print('Em 100 iterações, a melhor acurácia do NaiveBayes do scikit-learn foi: {0}'.\n",
    "      format(maxAccuracy))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
